Write a function that creates a histogram showing the distribution of characters in a string. Use this function to draw a bar chart for the letters in your first name.

library(ggplot2)

# Count the occurrences of each character in `text` and draw a bar chart.
# Returns a ggplot object so the caller can print or further customize it.
# Note: the "Set1" Brewer palette supports at most 9 distinct characters.
plot_char_distribution <- function(text) {
  chars <- strsplit(text, "")[[1]]
  char_count <- as.data.frame(table(chars))
  colnames(char_count) <- c("char", "count")
  ggplot(char_count, aes(x = char, y = count, fill = char)) +
    geom_col() +  # idiomatic form of geom_bar(stat = "identity")
    scale_fill_brewer(palette = "Set1") +
    ggtitle("Distribution of characters in first name")
}

# Draw the chart for the letters in the first name
plot_char_distribution("RAJKANWAR")

Dendrogram on US Arrests

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(ggdendro)
# Hierarchical clustering of USArrests with average linkage.
# Spell out "average" rather than relying on partial matching of "ave".
hc1 <- hclust(dist(USArrests), "average")
# Static ggplot dendrogram, then make it interactive with plotly
p1 <- ggdendrogram(hc1, rotate = FALSE, size = 2)
ggplotly(p1)

Draw the linked view for Cloud multivariate dataset (available in UCI repository).

# Load the library for data visualization
library(ggplot2)
# Load the cloud dataset (UCI repository); expects a local cloud.csv
idata <- read.csv("cloud.csv", header = TRUE)
# Scatter plot of visible-band mean vs IR mean with one overall linear trend.
# Colour is mapped on the points only: mapping a continuous `contrast` colour
# globally made geom_smooth() drop the aesthetic with a warning (stat_smooth
# cannot group by a continuous colour).
ggplot(data = idata, aes(x = Visible_mean, y = IR_mean)) +
  geom_point(aes(color = contrast)) +
  geom_smooth(method = "lm") +
  xlab("Visible Mean") +
  ylab("IR Mean") +
  ggtitle("Linked View for Cloud Multivariate Dataset: Visible VS IR Mean")
## `geom_smooth()` using formula = 'y ~ x'

Draw a graph matrix for Image Segmentation multivariate dataset (available in UCI repository).

# Load the Image Segmentation dataset (UCI repository); expects a local CSV
data <- read.csv("segmentation.csv", header = TRUE)
# kmeans() needs numeric input: drop non-numeric columns (e.g. a class label)
numeric_cols <- vapply(data, is.numeric, logical(1))
features <- data[, numeric_cols, drop = FALSE]
# Group the observations into 5 clusters with k-means.
# (stats is attached by default, so no library(stats) call is needed.)
set.seed(42)  # k-means uses random starts; fix the seed for reproducibility
kmeans_result <- kmeans(features, centers = 5)
clusters <- kmeans_result$cluster
# Plot the first two feature columns coloured by cluster. The original mapped
# the literal constants 1 and 2 in aes(), which drew every point at (1, 2).
library(ggplot2)
ggplot(features, aes(x = .data[[names(features)[1]]],
                     y = .data[[names(features)[2]]],
                     color = as.factor(clusters))) +
  geom_point(size = 3) +
  scale_color_discrete(name = "Cluster") +
  xlab(names(features)[1]) + ylab(names(features)[2]) +
  ggtitle("First Two Features Coloured by K-Means Cluster")

Draw exploratory graphics of a stock market Dataset.

library(ggplot2)
# Load the dataset
library(tidyquant)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Daily OHLCV prices for Apple from Yahoo Finance
data <- tq_get("AAPL", get = "stock.prices")

# Scatter plot of the closing price against trading volume
ggplot(data, aes(x = close, y = volume)) +
  geom_point(color = "red") +
  ggtitle("Scatter Plot of Closing Price and Volume")

# Line graph of the closing price over time
ggplot(data, aes(x = date, y = close)) +
  geom_line(color = "blue") +
  ggtitle("Closing Price of Apple Stock Over Time")

# Histogram of daily returns. The original plotted the raw adjusted price
# (with binwidth = 1); compute the day-over-day fractional change of the
# adjusted close instead, which is what "daily return" means.
returns <- data.frame(
  daily_return = diff(data$adjusted) / head(data$adjusted, -1)
)
ggplot(returns, aes(x = daily_return)) +
  geom_histogram(bins = 50, fill = "green") +
  ggtitle("Histogram of Daily Returns")

US Arrests K-means clustering

library(ggpubr)
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the USArrests dataset
data("USArrests")
df <- USArrests
# Compute k-means with k = 3
# Fix the RNG seed so the nstart = 25 random restarts are reproducible
set.seed(123)
# Variables are on very different scales (e.g. UrbanPop vs Murder), so
# standardize with scale() before clustering
res.km <- kmeans(scale(df), 3, nstart = 25)
# K-means clusters showing the group of each state
res.km$cluster
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              1              3              1 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              1              3              3              1              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              2              1              3              2 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              2              1              2              1 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              1              2              1              1 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              2              2              1              2              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              1              1              1              2              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              2              1              1              3              2 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              2              2              3
# Visualize the clusters as points with convex hulls; factoextra projects
# multi-column data to two dimensions for plotting (PCA by default per its
# documentation)
fviz_cluster(res.km, data = df,
            palette = c("#E69F00", "#56B4E9", "#009E73"), 
             geom = "point",
             ellipse.type = "convex", 
             ggtheme = theme_bw()
)

# Dimension reduction using PCA
res.pca <- prcomp(df, scale = TRUE)
# Coordinates of states
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(res.km$cluster)
# Percentage of variance explained by dimensions
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent

Visualize a k-nearest-neighbor search on a spatial ranking dataset, in the style of D3 quadtree demonstrations.

library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
## 
##     kurtosis, skewness
library(caTools)
library(class)
library(ggplot2)
data(iris)
head(iris)

# Split data into 70% train / 30% test.
# sample.split() expects the label VECTOR, not the whole data frame: passing
# `iris` recycled a length-5 mask over the rows instead of producing a
# stratified split by Species. The result is already logical, so compare
# nothing — use it (and its negation) directly.
set.seed(123)  # make the random split reproducible
split <- sample.split(iris$Species, SplitRatio = 0.7)
train_cl <- subset(iris, split)
test_cl <- subset(iris, !split)

# Feature scaling: centre/scale the TEST set with the TRAIN set's statistics.
# Scaling each set independently (as the original did) puts them on different
# scales, which distorts the nearest-neighbour distances.
train_scale <- scale(train_cl[, 1:4])
test_scale <- scale(test_cl[, 1:4],
                    center = attr(train_scale, "scaled:center"),
                    scale = attr(train_scale, "scaled:scale"))

# Helper: fit a k-NN classifier for one value of k, print its out-of-sample
# accuracy, and return the predictions.
knn_accuracy <- function(k) {
  pred <- knn(train = train_scale,
              test = test_scale,
              cl = train_cl$Species,
              k = k)
  print(paste("k =", k, "Accuracy =", mean(pred == test_cl$Species)))
  pred
}

# Fit with k = 1 and inspect the confusion matrix
classifier_knn <- knn_accuracy(1)
# Confusion matrix (rows = truth, columns = prediction)
cm <- table(test_cl$Species, classifier_knn)
cm

# Model evaluation — choosing k: compare out-of-sample accuracy over a range
for (k in c(3, 5, 7, 15, 19)) {
  classifier_knn <- knn_accuracy(k)
}

# Visualisation: all iris points coloured by true species, with the test-set
# predictions (from the last k fitted above) shown as large open circles
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point(size = 3) +
  ggtitle("Iris Data Set") +
  labs(x = "Petal Length", y = "Petal Width", color = "Species") +
  theme_bw() +
  geom_point(data = test_cl,
             aes(Petal.Length, Petal.Width, color = classifier_knn),
             size = 15, shape = 1)

Implement data visualization using dendrogram.

# Load data
data(mtcars)
# Compute pairwise Euclidean distances on the standardized variables
# (scale() first, since the mtcars columns use very different units)
dd <- dist(scale(mtcars), method = "euclidean")
# ward.D2: at each step merge the pair of clusters whose union gives the
# smallest increase in total within-cluster variance (Ward's criterion)
hc <- hclust(dd, method = "ward.D2")
library(factoextra)
# Basic dendrogram; cex shrinks the leaf labels for readability
fviz_dend(hc, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Same tree with an explicit title and axis labels
fviz_dend(hc, cex = 0.5, 
          main = "Dendrogram - ward.D2",
          xlab = "Objects", ylab = "Distance", sub = "")

#fviz_dend(hc, cex = 0.5, horiz = TRUE)
# Cut the tree into k = 4 groups; colour branches and labels by group
fviz_dend(hc, k = 4, cex=0.5,  k_colors = c("blue", "green", "red", "black"),                # Cut in four groups
          color_labels_by_k = TRUE, ggtheme = theme_gray() )

Visualize functional data with an application to eBay’s online auctions.

# Load required packages
library(fda)
## Loading required package: splines
## Loading required package: fds
## Loading required package: rainbow
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## Loading required package: pcaPP
## Loading required package: RCurl
## Loading required package: deSolve
## 
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
## 
##     matplot
library(fda.usc)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
##  fda.usc is running sequentially usign foreach package
##  Please, execute ops.fda.usc() once to run in local parallel mode
##  Deprecated functions: min.basis, min.np, anova.hetero, anova.onefactor, anova.RPm
##  New functions: optim.basis, optim.np, fanova.hetero, fanova.onefactor, fanova.RPm
## ----------------------------------------------------------------------------------
# Load Shill Bidding Dataset (eBay auction records from the UCI repository)
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00562/Shill%20Bidding%20Dataset.csv"
shill <- read.csv(url, header = TRUE)
# Drop non-relevant variables; keep only the number of bids and the duration
shill <- shill[, c("Auction_Bids", "Auction_Duration")]
# Sort the data by Auction_Bids — smooth.basis needs ascending argvals
shill <- shill[order(shill$Auction_Bids),]
# Create a functional data object from the Auction_Duration variable using B-splines
# (10 basis functions over the observed range of Auction_Bids)
basis <- create.bspline.basis(rangeval = range(shill$Auction_Bids), nbasis = 10)
# NOTE(review): Auction_Bids likely contains many tied values; it is not clear
# from here how smooth.basis handles repeated argvals — confirm against the
# fda documentation before relying on this fit.
fd_shill <- smooth.basis(shill$Auction_Bids, shill$Auction_Duration, basis)
# Plot the smoothed duration-vs-bids curve
plot(fd_shill, xlab = "Number of Bids", ylab = "Auction Duration",
     main = "Relationship between Bids and Auction Duration in eBay Auctions",
     col = "red", lwd = 2)

## [1] "done"

Show graphical data representation in classification using Iris dataset

# Load required libraries
library(ggplot2)
library(datasets)
library(reshape2)
# Load iris dataset
data(iris)
# Define custom color palette
my_colors <- c("#E69F00", "#56B4E9", "#009E73")

# Scatterplot with regression line
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_point(color = my_colors[1]) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Sepal Length", y = "Sepal Width", title = "Iris Dataset with Regression Line") +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

# Boxplot of Sepal length by Species
ggplot(iris, aes(Species, Sepal.Length)) +
  geom_boxplot(fill = my_colors[2]) +
  labs(x = "Species", y = "Sepal Length", title = "Iris Dataset: Sepal Length by Species") +
  theme_classic()

# Histogram of Petal width by Species
ggplot(iris, aes(Petal.Width, fill = Species)) +
  geom_histogram(alpha = 0.5, bins = 30) +
  scale_fill_manual(values = my_colors) +
  labs(x = "Petal Width", y = "Count", title = "Iris Dataset: Petal Width by Species") +
  theme_classic()

# Bar chart of Petal length by Species. geom_col() is the idiomatic
# replacement for geom_bar(stat = "identity"); each bar stacks the 50
# individual petal lengths, so its height is the per-species SUM.
ggplot(iris, aes(Species, Petal.Length, fill = Species)) +
  geom_col() +
  scale_fill_manual(values = my_colors) +
  labs(x = "Species", y = "Petal Length", title = "Iris Dataset: Petal Length by Species") +
  theme_classic()

# Heatmap of correlations between the four numeric variables
iris_cor <- round(cor(iris[,1:4]), 2) # Calculate correlation matrix
ggplot(data = melt(iris_cor), aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = my_colors[1], high = my_colors[3], mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Correlation") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 10, hjust = 1)) +
  labs(x = "", y = "", title =
         "Correlations between variables in Iris dataset")

Draw a graph matrix for mushroom dataset (available in UCI repository).

# Load the necessary packages
library(readr)
# Load the mushroom dataset
mushrooms <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
                      col_names = FALSE)
## Rows: 8124 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): X1, X2, X3, X4, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16,...
## lgl  (1): X5
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Draw the graph matrix, coloured by X1 (the first column — the class label
# in the UCI file). Refer to the column with a bare name: using
# `mushrooms$X1` inside aes() bypasses ggplot's data-masking pipeline.
# NOTE(review): a 23-column pairs plot is very large and slow to render;
# consider subsetting columns if this is used interactively.
ggpairs(mushrooms, aes(colour = X1))